import torch

# Pick the fastest available backend: CUDA (NVIDIA) > MPS (Apple Silicon) > CPU.
if torch.cuda.is_available():
    _backend = 'cuda'
elif torch.backends.mps.is_available():
    _backend = 'mps'
else:
    _backend = 'cpu'
device = torch.device(_backend)
print(device)
# Load the fine-tuned model (after training)
from transformers import AutoModelForMaskedLM, AutoConfig, DistilBertTokenizerFast

# Local directory holding the Elsevier fine-tuned DistilBERT checkpoint.
dir_model = "./model-elsevier (lr 5e-5)"
# The tokenizer itself was not fine-tuned, so the stock base tokenizer is reused.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# output_hidden_states=True exposes the per-layer hidden states that
# get_embeddings later reads from the model outputs.
config = AutoConfig.from_pretrained(dir_model, output_hidden_states=True)
model_elsevier = AutoModelForMaskedLM.from_pretrained(dir_model, config=config).to(device)
mps
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

# Shared scaler: every embedding matrix below is min-max normalized per feature.
# NOTE(review): fit_transform is called separately on each matrix, so the
# scalings are not directly comparable across corpora — confirm this is intended.
scaler = MinMaxScaler()
def get_embeddings(vocab, model, batch_size=100):
    """Return min-max-scaled, token-averaged embedding vectors for every word in vocab.

    Parameters
    ----------
    vocab : list[str]
        Words to embed; tokenized with the module-level ``tokenizer``.
    model : transformers model
        Must be configured with ``output_hidden_states=True``.
    batch_size : int
        Number of words per forward pass.

    Returns
    -------
    numpy.ndarray of shape (len(vocab), hidden_dim), scaled to [0, 1] per feature.
    """
    tokenized_words = tokenizer(vocab, return_tensors="pt",
                                padding=True, truncation=True).to(device)
    # Split tokenized words into batches.
    # BUG FIX: the attention mask was previously sliced with a hard-coded
    # `i:i + 100`, which misaligned input_ids and attention_mask for any
    # batch_size other than 100.
    token_batches = [{'input_ids': tokenized_words["input_ids"][i:i + batch_size],
                      'attention_mask': tokenized_words["attention_mask"][i:i + batch_size]}
                     for i in range(0, len(tokenized_words["input_ids"]), batch_size)]
    # Collected per-batch embedding arrays.
    all_word_embeddings = []
    for batch_tokens in tqdm(token_batches):
        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = model(input_ids=batch_tokens["input_ids"],
                            attention_mask=batch_tokens["attention_mask"])
        # hidden_states[0] is the embedding-layer output (before any transformer block).
        batch_embeddings = outputs.hidden_states[0].cpu().numpy()
        all_word_embeddings.append(batch_embeddings)
    # Concatenate the embeddings from all batches.
    all_word_embeddings_concat = np.concatenate(all_word_embeddings, axis=0)
    # Average over the token dimension so each word maps to a single vector.
    all_word_embeddings_concat = np.mean(all_word_embeddings_concat, axis=1)
    # Module-level scaler is refit on every call.
    return scaler.fit_transform(all_word_embeddings_concat)
# Get the vocabulary
# All WordPiece tokens of the DistilBERT tokenizer (30,522 entries).
unique_tokens_elsevier = list(tokenizer.get_vocab())
all_word_embeddings_elsevier = get_embeddings(unique_tokens_elsevier,
                                              model_elsevier)
# Expected shape: (vocab_size, hidden_dim), e.g. (30522, 768).
print(all_word_embeddings_elsevier.shape)
100%|█████████████████████████████████████████| 306/306 [00:21<00:00, 13.93it/s]
(30522, 768)
# Load reddit model
# Same procedure as for Elsevier, with the Reddit fine-tuned checkpoint.
dir_model = "./model-redit(lr 5e-5)"
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
config = AutoConfig.from_pretrained(dir_model, output_hidden_states=True)
model_reddit = AutoModelForMaskedLM.from_pretrained(dir_model, config=config).to(device)
# Get the vocabulary
unique_tokens_reddit = list(tokenizer.get_vocab())
all_word_embeddings_reddit = get_embeddings(unique_tokens_reddit,
                                            model_reddit)
print(all_word_embeddings_reddit.shape)
100%|█████████████████████████████████████████| 306/306 [00:21<00:00, 14.42it/s]
(30522, 768)
# Read the file and extract words and vectors
with open('vectors.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
# Get glove vector from file txt
# Each line has the form "<word> <v1> <v2> ... <vd>".
glove_vectors = {}
for line in tqdm(lines):
    parts = line.split()
    word = parts[0]
    vector = np.array([float(x) for x in parts[1:]])
    glove_vectors[word] = vector
# Extract vectors
unique_tokens_glove = list(glove_vectors.keys())
# Convert the list of vectors to a NumPy array and normalize
# Row order matches unique_tokens_glove (dicts preserve insertion order).
all_word_embeddings_glove = scaler.fit_transform(np.array(list(glove_vectors.values())))
print(all_word_embeddings_glove.shape)
100%|████████████████████████████████| 100001/100001 [00:07<00:00, 13382.02it/s]
(100001, 768)
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Perform t-SNE dimensionality reduction
# Project the 768-d Elsevier embeddings down to 2-D for visual inspection.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d_elsevier = tsne.fit_transform(all_word_embeddings_elsevier)
# Plot the 2D embeddings
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d_elsevier[:, 0], embeddings_2d_elsevier[:, 1], s=5)
plt.title('t-SNE Visualization of Word Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()
# Perform same 2d plot to compare
# Same projection for the Reddit embeddings so both plots are comparable.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d_reddit = tsne.fit_transform(all_word_embeddings_reddit)
# Plot the 2D embeddings
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d_reddit[:, 0], embeddings_2d_reddit[:, 1], s=5)
plt.title('t-SNE Visualization of Word Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Perform same 2d plot to compare
# NOTE: ~100k GloVe vectors make this t-SNE considerably slower than the 30k vocab runs.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d_glove = tsne.fit_transform(all_word_embeddings_glove)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
# Plot the 2D embeddings
plt.figure(figsize=(10, 8))
# BUG FIX: this previously scattered the first two columns of the raw
# high-dimensional matrix (all_word_embeddings_glove) instead of the t-SNE
# projection computed just above.
plt.scatter(embeddings_2d_glove[:, 0], embeddings_2d_glove[:, 1], s=5)
plt.title('t-SNE Visualization of Word Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()
Let's load a dataset with categories in order to get more information from the plots and later evaluate the different embeddings we obtained.
import pandas as pd

# Word-categorization benchmark (AP dataset): one row per (word, category) pair.
categories_df = pd.read_csv('https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-categorization/monolingual/en/ap.csv',
                            index_col=0)
# Build a category -> array-of-member-words mapping, dropping missing entries.
categories = {}
for category in set(categories_df["category"]):
    categories[category] = categories_df.loc[categories_df["category"] == category, "word"].dropna().values
categories["animal"]
array(['bear', 'bull', 'camel', 'cat', 'cow', 'deer', 'dog', 'elephant',
'horse', 'kitten', 'lion', 'monkey', 'mouse', 'oyster', 'puppy',
'rat', 'sheep', 'tiger', 'turtle', 'zebra'], dtype=object)
# Plot the embeddings with a category in red
category = "feeling"
plt.figure(figsize=(10, 8))
# Background: every word in light blue (s is the marker size).
scatter = plt.scatter(embeddings_2d_elsevier[:, 0], embeddings_2d_elsevier[:, 1], s=5, c='lightblue')
# Overlay the words belonging to the chosen category in red.
# (Removed an unused `index = unique_tokens_elsevier.index(word)` lookup that
# cost an O(n) list scan per highlighted word and whose result was never read.)
for word, (x, y) in zip(unique_tokens_elsevier, embeddings_2d_elsevier):
    if word in categories[category]:
        scatter = plt.scatter(x, y, s=5, c='red', label=word)
plt.title('t-SNE Visualization of Word Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()
# Perform t-SNE with 3 dimensions
tsne = TSNE(n_components=3, random_state=42)
embeddings_3d_elsevier = tsne.fit_transform(all_word_embeddings_elsevier)
import plotly.express as px

# Interactive 3-D scatter of the projected Elsevier embeddings.
fig = px.scatter_3d(x=embeddings_3d_elsevier[:, 0],
                    y=embeddings_3d_elsevier[:, 1],
                    z=embeddings_3d_elsevier[:, 2])
fig.update_traces(marker_size = 2)
fig.show()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
As you can see in the previous plot, there is a huge accumulation of points and some outliers. The outliers make it difficult to visualize the accumulation, so let's remove them.
import numpy as np
from scipy.stats import zscore

# Calculate z-scores for each dimension
z_scores = zscore(embeddings_3d_elsevier, axis=0)
# Set a threshold for z-scores to identify outliers (adjust as needed)
threshold = 2.5
# Identify indices of non-outliers
# A point is kept only if it lies within the threshold on ALL three axes.
non_outlier_indices = np.all(np.abs(z_scores) < threshold, axis=1)
# Filter data to exclude outliers
filtered_embeddings = embeddings_3d_elsevier[non_outlier_indices]
fig = px.scatter_3d(x=filtered_embeddings[:, 0],
                    y=filtered_embeddings[:, 1],
                    z=filtered_embeddings[:, 2])
fig.update_traces(marker_size = 2)
fig.show()
# Plot in red a category
import plotly.graph_objects as go

category = "feeling"
# Background cloud: the outlier-filtered points, semi-transparent.
fig = px.scatter_3d(x=filtered_embeddings[:, 0],
                    y=filtered_embeddings[:, 1],
                    z=filtered_embeddings[:, 2],
                    opacity=0.5)
# Change the color of specific words
# NOTE(review): highlights are drawn from the UNfiltered embeddings_3d_elsevier,
# so a category word removed as an outlier can appear outside the background
# cloud — confirm this mix of filtered/unfiltered data is intended.
for word, (x, y, z) in zip(unique_tokens_elsevier, embeddings_3d_elsevier):
    if word in categories[category]:
        fig.add_trace(
            go.Scatter3d(x=[x],
                         y=[y],
                         z=[z],
                         mode="markers",
                         marker=dict(
                             size=12,
                             color="red")))
# NOTE: this overrides the size=12 set above — all traces end up with size 2.
fig.update_traces(marker_size = 2)
fig.update_layout(showlegend=False)
fig.show()
In this 3D representation, the categories are more grouped together.
In order to evaluate the embeddings, let's create a function that returns the first n words that fit a given analogy. For example, for the analogy man is to woman as king is to ?, we compute the difference between the woman and man vectors, add that displacement to the king vector, and among the n nearest words to the result we should find queen.
def get_embeddings_word(word, vocab, embeddings, method="transformer"):
    """Look up the embedding vector of a single word.

    With method="glove" the vector comes from the module-level glove_vectors
    dict; otherwise it is the row of `embeddings` at the word's vocab index.
    """
    if method == "glove":
        return glove_vectors[word]
    position = vocab.index(word)
    return embeddings[position]
from sklearn.metrics.pairwise import cosine_similarity
def get_analogy(w1, w2, w3, vocab, embeddings, n, method="transformer"):
    """Return candidate words completing the analogy w1 - w2 + w3.

    The displacement (w1 - w2) is added to w3's vector, and the vocabulary
    words closest to the result by cosine similarity are returned, best
    first, with w3 itself filtered out.
    """
    vec_a = get_embeddings_word(w1, vocab, embeddings, method=method)
    vec_b = get_embeddings_word(w2, vocab, embeddings, method=method)
    vec_c = get_embeddings_word(w3, vocab, embeddings, method=method)
    # Shift w3 by the direction that separates w1 from w2.
    target_vector = vec_c + (vec_a - vec_b)
    # Cosine similarity of the shifted vector against every vocabulary embedding.
    similarities = cosine_similarity([target_vector], embeddings)
    # Indices of the n+1 most similar words, most similar first.
    top_indices = similarities.argsort()[0][-1 - n:][::-1]
    return [vocab[idx] for idx in top_indices if vocab[idx] != w3]
get_analogy("woman", "man", "king", unique_tokens_elsevier, all_word_embeddings_elsevier, 3)
['queen', 'woman', 'kings']
get_analogy("woman", "man", "king",
unique_tokens_glove, all_word_embeddings_glove, n=5, method="glove")
['woman', 'floating_adj', 'earmarked', 'dedham', 'swollen_verb']
Let's load an analogies dataset and evaluate our embeddings.
import pandas as pd

# Google analogy benchmark: columns word1, word2, word3, target.
analogy_df = pd.read_csv('https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-analogy/monolingual/en/google-analogies.csv',
                         index_col=0)
def eval_analogies(vocab, embeddings, x=len(analogy_df), method="glove"):
    """Score embeddings on the first x rows of the analogy benchmark.

    For each analogy whose four words are all in vocab, the top-5 candidates
    from get_analogy are checked; if the target appears, its reversed rank + 1
    is added to the score (a hit ranked best contributes the most).
    Higher is better.

    NOTE: the default for x is evaluated once at definition time.
    """
    punct = 0
    for i in tqdm(range(len(analogy_df[:x]))):
        if all([word.lower() in vocab for word in analogy_df.loc[i, ["word1", "word2", "word3", "target"]].values]):
            target = analogy_df.loc[i, "target"].lower()
            # BUG FIX: `method` was accepted but never forwarded, so calls
            # like eval_analogies(..., method="glove") silently used the
            # transformer (index-based) lookup path instead.
            similar_words = get_analogy(analogy_df.loc[i, "word1"].lower(),
                                        analogy_df.loc[i, "word2"].lower(),
                                        analogy_df.loc[i, "word3"].lower(),
                                        vocab,
                                        embeddings, 5,
                                        method=method)
            similar_words.reverse()
            if target in similar_words:
                punct = punct + similar_words.index(target) + 1
    return punct
analogies_elsevier = eval_analogies(unique_tokens_elsevier, all_word_embeddings_elsevier)
analogies_elsevier
100%|█████████████████████████████████████| 19544/19544 [08:08<00:00, 39.98it/s]
19267
analogies_reddit = eval_analogies(unique_tokens_reddit, all_word_embeddings_reddit)
analogies_reddit
100%|█████████████████████████████████████| 19544/19544 [08:23<00:00, 38.83it/s]
16258
analogies_glove = eval_analogies(unique_tokens_glove, all_word_embeddings_glove,
x=1000, method="glove")
analogies_glove
100%|███████████████████████████████████████| 1000/1000 [02:42<00:00, 6.16it/s]
0
from sklearn.metrics.pairwise import cosine_similarity
def predict_category(word, vocab, embeddings, threshold=0.7, category_centroids=None):
    """Predict the category of `word` by maximum cosine similarity.

    For each category, the similarity of the word to every member word's
    embedding is computed and the per-category maximum is taken; the
    best-matching category is returned if it exceeds `threshold`, otherwise
    "Uncategorized" (also returned when the word is not in vocab).

    Parameters
    ----------
    category_centroids : dict | None
        Optional precomputed {category: stacked member embeddings} mapping.
        PERF: previously these matrices were rebuilt on every single call;
        pass them in when classifying many words in a loop.
    """
    if category_centroids is None:
        category_centroids = {
            category: get_category_centroids(words, vocab, embeddings)
            for category, words in categories.items()
        }
    # Unknown word: nothing to compare against.
    if word not in vocab:
        return "Uncategorized"
    word_embedding = get_embeddings_word(word, vocab, embeddings)
    # Best similarity of the word to any member vector of each category.
    max_category = {}
    for category in category_centroids:
        similarities = cosine_similarity([word_embedding], category_centroids[category])
        max_category[category] = similarities.max()
    if max(max_category.values()) > threshold:
        return max(max_category, key=max_category.get)
    return "Uncategorized"
def get_category_centroids(words, vocab, embeddings):
    """Stack the embedding vectors of every word in `words` present in vocab.

    Despite the name, this returns the full (k, dim) matrix of member
    vectors rather than a single averaged centroid.
    """
    member_vectors = [
        get_embeddings_word(word, vocab, embeddings)
        for word in words
        if word in vocab
    ]
    return np.vstack(member_vectors)
# Usage:
# Sanity-check the classifier on a word with an obvious category.
word_to_predict = "elephant"
predicted_category = predict_category(word_to_predict, unique_tokens_elsevier, all_word_embeddings_elsevier)
print(f"The predicted category for '{word_to_predict}' is '{predicted_category}'.")
The predicted category for 'elephant' is 'animal'.
def eval_categories(vocab, embeddings):
    """Count benchmark rows whose predicted category matches the labeled one.

    Rows with a missing category or word are skipped.
    """
    punct = 0
    for i in tqdm(range(len(categories_df))):
        # BUG FIX: the original check `x != float("nan")` is ALWAYS True,
        # because NaN never compares equal to anything (including itself),
        # so NaN rows were never actually skipped. pd.notna does this right,
        # and both fields must be present (and, not or).
        if pd.notna(categories_df.loc[i, "category"]) and pd.notna(categories_df.loc[i, "word"]):
            observed_category = categories_df.loc[i, "category"].lower()
            predicted_category = predict_category(categories_df.loc[i, "word"],
                                                  vocab,
                                                  embeddings)
            if observed_category == predicted_category:
                punct = punct + 1
    return punct
categories_elsevier = eval_categories(unique_tokens_elsevier, all_word_embeddings_elsevier)
categories_elsevier
100%|█████████████████████████████████████████| 423/423 [00:38<00:00, 11.10it/s]
285
categories_reddit = eval_categories(unique_tokens_reddit, all_word_embeddings_reddit)
categories_reddit
100%|█████████████████████████████████████████| 423/423 [00:38<00:00, 10.95it/s]
285
def get_embeddings_concept(words, vocab, all_embeddings):
    """Return (words, mean vector) aggregating the embeddings of `words`.

    The mean vector represents the shared "concept" of the word group.
    Assumes every word in `words` is present in vocab.
    """
    # PERF FIX: the original called np.vstack inside the loop (quadratic —
    # the accumulated array was copied on every iteration) and hard-coded
    # the 768 embedding width. Collect rows first and stack once instead.
    embeddings_words = np.vstack(
        [get_embeddings_word(word, vocab, all_embeddings) for word in words]
    )
    # Take the mean of embeddings
    return words, np.mean(embeddings_words, axis=0)
# Aggregate embedding for the "loss" concept in each corpus.
embedding_elsevier_losses = get_embeddings_concept(["losses", "loss", "lost", "losing", "lose", "loses"],
                                                   unique_tokens_elsevier, all_word_embeddings_elsevier)
embedding_reddit_losses = get_embeddings_concept(["losses", "loss", "lost", "losing", "lose", "loses"],
                                                 unique_tokens_reddit, all_word_embeddings_reddit)
# Compare the two concept vectors via sklearn's cosine_similarity...
vector_elsevier = embedding_elsevier_losses[1].reshape(1, -1)
vector_reddit = embedding_reddit_losses[1].reshape(1, -1)
cosine_similarity_score = cosine_similarity(vector_reddit, vector_elsevier)
print("Cosine Similarity:", cosine_similarity_score[0][0])
from numpy.linalg import norm

# ...and cross-check with the manual dot-product formulation of cosine similarity.
A=embedding_elsevier_losses[1]
B=embedding_reddit_losses[1]
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)
dot_p = np.dot(A,B)
print("Dot product :", dot_p)
Cosine Similarity: 0.9639859542835594 Cosine Similarity: 0.9639859542835584 Dot product : 197.98296380900612
import string
def get_similar_words(aggregate_embeddings, vocab, all_embeddings, n):
    """Return up to n words most similar to an aggregated concept vector.

    Parameters
    ----------
    aggregate_embeddings : tuple(list[str], ndarray)
        Output of get_embeddings_concept: the concept's member words and
        their mean vector.
    n : int
        Number of similar words to return.

    Words that belong to the concept itself, and tokens containing no
    lowercase ASCII letter (pure punctuation/numbers/special tokens),
    are skipped.
    """
    concept_words = aggregate_embeddings[0]
    # Cosine similarity of the concept vector against every vocabulary embedding.
    similarities = cosine_similarity([aggregate_embeddings[1]], all_embeddings)
    # All vocabulary indices ordered from most to least similar.
    ranked_indices = similarities.argsort()[0][::-1]
    similar_words = []
    # BUG FIX: the original `while len(similar_words) != n` loop incremented
    # an unchecked index and raised IndexError (or spun forever) whenever
    # fewer than n words qualified; a bounded for-loop cannot overrun.
    for idx in ranked_indices:
        if len(similar_words) == n:
            break
        candidate = vocab[idx]
        if any(char in string.ascii_lowercase for char in candidate) and candidate not in concept_words:
            similar_words.append(candidate)
    return similar_words
get_similar_words(embedding_elsevier_losses,
unique_tokens_elsevier,
all_word_embeddings_elsevier, 10)
['regained', 'regaining', 'casualties', 'relinquished', 'lineman', 'ceded', 'regain', 'scowled', 'injuring', 'victories']
get_similar_words(embedding_reddit_losses,
unique_tokens_reddit, all_word_embeddings_reddit, 10)
['regained', 'regain', 'regaining', 'ceded', 'retain', 'gained', 'gaining', 'relinquished', 'destroys', 'retains']
embedding_glove_losses = get_embeddings_concept(["losses", "loss", "lost", "losing", "lose", "loses"],
unique_tokens_glove, all_word_embeddings_glove)
get_similar_words(embedding_glove_losses,
unique_tokens_glove, all_word_embeddings_glove, 10)
['<unk>', 'limping', 'teddy_noun', 'lobed_verb', 'superannuation_noun', 'beekman', 'meddle', 'participants_noun', 'balked_verb', 'kk_noun']
# Finance- and emotion-related probe words, used to measure how close each
# corpus places them to the aggregated "loss" concept.
related_words = ["pay", "payment", "tax", "taxes", "gamble",
                 "monetary", "money", "purchase", "trade",
                 "trading", "gain", "grief", "negative", "sentimental"]
def get_distance_from_concept(words, concept_embeddings, vocab, embeddings):
    """Map each word to its (cosine similarity, dot product) against a concept vector.

    Returns a dict {word: (cosine, dot)} computed between the word's
    embedding and `concept_embeddings`.
    """
    distance = {}
    for word in words:
        word_vector = get_embeddings_word(word, vocab, embeddings)
        # cosine_similarity expects 2-D inputs; index [0][0] unwraps the 1x1 result.
        cos_sim = cosine_similarity(word_vector.reshape(1, -1),
                                    concept_embeddings.reshape(1, -1))[0][0]
        distance[word] = (cos_sim, np.dot(word_vector, concept_embeddings))
    return distance
distance_concept_elsevier = get_distance_from_concept(related_words, embedding_elsevier_losses[1],
unique_tokens_elsevier, all_word_embeddings_elsevier)
distance_concept_elsevier
{'pay': (0.976685450134017, 202.48150320327534),
'payment': (0.9795798593239768, 201.03050568760682),
'tax': (0.9769047155841063, 202.31815508921716),
'taxes': (0.9801746978088899, 200.84567882997297),
'gamble': (0.9814993664983425, 201.71005262240448),
'monetary': (0.9791837669040321, 200.97352997847997),
'money': (0.9766291317097666, 202.51230657035964),
'purchase': (0.9788920054119846, 201.54759388989274),
'trade': (0.9777536996312579, 202.98090830094583),
'trading': (0.9779334863416884, 201.5486211991196),
'gain': (0.9834219178256203, 203.47531728225005),
'grief': (0.9854000723591345, 201.52914524830607),
'negative': (0.9792852335941129, 202.35542841652358),
'sentimental': (0.9867355109991538, 202.91185808372697)}
distance_concept_reddit = get_distance_from_concept(related_words, embedding_reddit_losses[1],
unique_tokens_reddit, all_word_embeddings_reddit)
distance_concept_reddit
{'pay': (0.9783330549429938, 202.84510952451416),
'payment': (0.9792008952505205, 202.11917536586157),
'tax': (0.9783460494492402, 202.89668067797345),
'taxes': (0.9804420284383639, 201.24236903036586),
'gamble': (0.9822085261322759, 202.98225621427594),
'monetary': (0.9805221310226453, 201.69854234479106),
'money': (0.9784580044630313, 201.92124390420213),
'purchase': (0.9800533481079252, 202.4430386701623),
'trade': (0.979102189727739, 203.17395386707813),
'trading': (0.9792906392765106, 202.20434713114173),
'gain': (0.984792278270893, 203.58430323622918),
'grief': (0.9826914948523917, 202.28639495461636),
'negative': (0.9808707301004688, 202.54485887176503),
'sentimental': (0.9822289429984885, 202.32761765047832)}
# Unpack the (cosine, dot-product) pairs for side-by-side plotting.
keys = list(distance_concept_reddit.keys())
cosine_dist_reddit = [value[0] for value in list(distance_concept_reddit.values())]
dot_product_reddit = [value[1] for value in list(distance_concept_reddit.values())]
cosine_dist_elsevier = [value[0] for value in list(distance_concept_elsevier.values())]
dot_product_elsevier = [value[1] for value in list(distance_concept_elsevier.values())]
# Paired horizontal bars: cosine similarity per word, Reddit vs Elsevier.
plt.figure(figsize=(10,6))
plt.barh(np.arange(len(keys))-0.2, cosine_dist_reddit, height= 0.4, label="Reddit")
plt.barh(np.arange(len(keys))+0.2, cosine_dist_elsevier, height= 0.4, label="Elsevier")
plt.yticks(np.arange(len(keys)), keys)
# Narrow x-range: all similarities are ~0.98, so zooming in exposes the differences.
plt.xlim((0.975, 0.99))
plt.legend()
plt.show()
# Same paired-bar comparison for the raw dot products.
plt.figure(figsize=(10,6))
plt.barh(np.arange(len(keys))-0.2, dot_product_reddit, height= 0.4, label="Reddit")
plt.barh(np.arange(len(keys))+0.2, dot_product_elsevier, height= 0.4, label="Elsevier")
plt.yticks(np.arange(len(keys)), keys)
# Narrow x-range: dot products all fall between ~200 and ~204.
plt.xlim((200, 205))
plt.legend()
plt.show()